% finaquant Financial Analytics - www.finaquant.com
% Copyright: Tunc Ali Ktkcoglu  2012, version: 8Mar2012
% TITLE: Simple Time Estimation Model with Linear Regression (LR) in Action
% Related web page:
% http://finaquant.com/predictive-modelling-with-linear-regression/520

% Predictive model for estimating the time record of an athlete (in seconds)
% running a 100-meter distance against the wind (speed in km/h) with a given
% weight (in kg) attached to his belt.

% y = run_time_record(x1, x2, t):
% Hypothetical function that describes assumed physical relationships.
% The resultant time record (y) is a function of three parameters:
% - x1: wind speed (km/h)
% - x2: weight (kg)
% - t: temperature (Celsius); optimal running temperature is 20 C
%
% The parameter "temperature" which is not captured by the estimation model
% represents generally all kinds of unknown factors affecting the outcome.
% In that sense, this parameter also represents the uncertainty and error
% in estimations based on input parameters x2 and/or x1.

% Visualise the hypothetical physical function for a fixed belt weight.
% The grid spans wind speeds 0..40 km/h and temperatures 0..30 degrees
% Celsius, both sampled in steps of 0.5.
[X1, T] = meshgrid(0:0.5:40, 0:0.5:30);
% the weight is held at 1 kg for every grid point
X2 = ones(size(X1));
% evaluate the physical function on the whole grid
Y = run_time_record(X1, X2, T);
p0 = surfc(X1, T, Y);
title('Physical Function: Time-record as a function of wind and temperature');
xlabel('wind speed (km/h)');
ylabel('temperature (Celcius)');
zlabel('time record (seconds)');
axis([0 40 0 30 10 20])
% open a fresh figure window so the next plot does not overwrite this one
figure


% Sample sizes: N observations for training, M observations for testing
M = 100;
N = 200;

disp('Simple Time Estimation Model with Linear Regression (LR) in Action');

% ---- Training set: N random observations --------------------------------
disp('Generate TRAINING data');
x1 = rand(1,N) * 50;         % wind speed, drawn from 0..50 km/h
x2 = rand(1,N) * 4;          % belt weight, drawn from 0..4 kg
t  = rand(1,N) * 20 + 10;    % temperature, drawn from 10..30 C
y  = run_time_record(x1, x2, t);

% Regressor matrix: one observation per column, rows are [1; x1; x2].
% The temperature is deliberately left out of the regressors.
X_train = vertcat(ones(1,N), x1, x2);
Y_train = y';

% ---- Test set: M random observations drawn the same way -----------------
disp('Generate TEST data');
x1 = rand(1,M) * 50;         % wind speed, drawn from 0..50 km/h
x2 = rand(1,M) * 4;          % belt weight, drawn from 0..4 kg
t  = rand(1,M) * 20 + 10;    % temperature, drawn from 10..30 C
y  = run_time_record(x1, x2, t);

% Same layout as the training matrices
X_test = vertcat(ones(1,M), x1, x2);
Y_test = y';

disp('******************************************************************');
disp('CASE 1: Single parameter time estimation with wind speed (x1)');
disp('******************************************************************');

% Model: ye = b0 + b1 * x1
% Only the first two regressor rows (intercept and wind speed) are used.
X_train1 = X_train(1:2,:); 
X_test1 = X_test(1:2,:);

disp('Find optimal coefficient vector B with training data');
% Least-squares solution of the normal equations (X*X') * B = X*Y.
% Solved with the backslash operator instead of inv(): no explicit inverse
% is formed, which is both faster and numerically more accurate.
% (No trailing semicolon on purpose: the coefficients are displayed.)
Bopt = (X_train1 * X_train1') \ (X_train1 * Y_train)

disp('Calculate estimated time records');
Ye_train = X_train1' * Bopt;
Ye_test = X_test1' * Bopt;

disp('Calculate training and test error');
% Mean squared error = sum of squared residuals / number of observations
E_train = Y_train - Ye_train;
SSE_train = E_train' * E_train;
MSE_train = SSE_train / N

E_test = Y_test - Ye_test;
SSE_test = E_test' * E_test;
MSE_test = SSE_test / M

disp('Show curve fitting (approximation) for test data');
% Sort the test observations by wind speed so the estimate draws as a line
[x1, ind] = sort(X_test(2,:)); 

p1 = plot(x1, Y_test(ind)','+', x1, Ye_test(ind)','r');
xlabel('wind speed (km/h)')
ylabel('time record')
title('Case 1: Observed (x dots) vs estimated (red line) time records')
% open a fresh figure window so the next plot does not overwrite this one
figure

disp('******************************************************************');
disp('CASE 2: Single parameter time estimation with 2nd degree polynomial regression');
disp('******************************************************************');

% 2nd degree polynomial approximation
% ye = b0 + b1 * x1 + b2 * x1^2
% Regressor rows: 1, x1, x1^2 (one observation per column)
x1 = X_train(2,:);
X_train2 = [X_train(1:2,:); x1 .^ 2];

x1 = X_test(2,:);
X_test2 = [X_test(1:2,:); x1 .^ 2];

disp('Find optimal coefficient vector B with training data');
% Least-squares solution of the normal equations (X*X') * B = X*Y.
% Solved with the backslash operator instead of inv(): the squared
% regressor makes X*X' worse conditioned, so avoiding the explicit inverse
% matters more here than in the purely linear case.
% (No trailing semicolon on purpose: the coefficients are displayed.)
Bopt = (X_train2 * X_train2') \ (X_train2 * Y_train)

disp('Calculate estimated time records');
Ye_train = X_train2' * Bopt;
Ye_test = X_test2' * Bopt;

disp('Calculate training and test error');
% Mean squared error = sum of squared residuals / number of observations
E_train = Y_train - Ye_train;
SSE_train = E_train' * E_train;
MSE_train = SSE_train / N

E_test = Y_test - Ye_test;
SSE_test = E_test' * E_test;
MSE_test = SSE_test / M

disp('Show curve fitting (approximation) for test data');
% Sort the test observations by wind speed so the estimate draws as a curve
[x1, ind] = sort(X_test(2,:)); 

p2 = plot(x1, Y_test(ind)','+', x1, Ye_test(ind)','r');
xlabel('wind speed (km/h)')
ylabel('time record')
title('Case 2: Observed (x dots) vs estimated (red line) time records')
% open a fresh figure window so the next plot does not overwrite this one
figure

disp('******************************************************************');
disp('CASE 3: Two parameter time estimation with wind speed (x1) and weight (x2)');
disp('******************************************************************');

% Model: ye = b0 + b1 * x1 + b2 * x2
% Uses the full regressor matrices (rows: 1, x1, x2).

disp('Find optimal coefficient vector B with training data');
% Least-squares solution of the normal equations (X*X') * B = X*Y.
% Solved with the backslash operator instead of inv(): no explicit inverse
% is formed, which is both faster and numerically more accurate.
% (No trailing semicolon on purpose: the coefficients are displayed.)
Bopt = (X_train * X_train') \ (X_train * Y_train)

disp('Calculate estimated time records');
Ye_train = X_train' * Bopt;
Ye_test = X_test' * Bopt;

disp('Calculate training and test error');
% Mean squared error = sum of squared residuals / number of observations
E_train = Y_train - Ye_train;
SSE_train = E_train' * E_train;
MSE_train = SSE_train / N

E_test = Y_test - Ye_test;
SSE_test = E_test' * E_test;
MSE_test = SSE_test / M

disp('Prepare data for surface plot');
% wind speed range: 0-40 km/h, weight range: 0-4 kg
[X1, X2] = meshgrid(0:0.5:40, 0:0.2:4); 
% random temperature (10-30 C) per grid point, as in the generated data sets
T = 10 + 20 * rand(size(X1));

% generate observed (actual) time records
Y = run_time_record(X1, X2, T);

% generate estimated time records from the fitted coefficients
Ye = Bopt(1) + Bopt(2) * X1 + Bopt(3) * X2;

% surface plot of the estimate
p3 = surfc(X1, X2, Ye);
axis([0 40 0 4 10 18])
xlabel('wind speed (km/h)');
ylabel('weight (kg)');
zlabel('time record (seconds)');
title('Case 3: Time record estimation');
% new figure window for the observed surface
figure

% surface plot of the observed values on the same grid
p4 = surfc(X1, X2, Y);
axis([0 40 0 4 10 18])
xlabel('wind speed (km/h)');
ylabel('weight (kg)');
zlabel('time record (seconds)');
title('Case 3: Observed historical time records');
% open a fresh figure window so the next plot does not overwrite this one
figure

disp('******************************************************************');
disp('CASE 4: Two parameter time estimation with 2nd degree polynomial regression');
disp('******************************************************************');

% 2nd degree polynomial approximation
% ye = b0 + b1 * x1 + b2 * x1^2 + b3 * x2 + b4 * x2^2

% Regressor matrices with 5 rows: 1, x1, x1^2, x2, x2^2
% (one observation per column; the first two rows are taken over unchanged)
% training data
x1 = X_train(2,:);
x2 = X_train(3,:);
X_train4 = [X_train(1:2,:); x1 .^ 2; x2; x2 .^ 2];
% test data
x1 = X_test(2,:);
x2 = X_test(3,:);
X_test4 = [X_test(1:2,:); x1 .^ 2; x2; x2 .^ 2];

disp('Find optimal coefficient vector B with training data');
% Least-squares solution of the normal equations (X*X') * B = X*Y.
% Solved with the backslash operator instead of inv(): the squared
% regressors make X*X' worse conditioned, so avoiding the explicit inverse
% matters more here than in the purely linear case.
% (No trailing semicolon on purpose: the coefficients are displayed.)
Bopt = (X_train4 * X_train4') \ (X_train4 * Y_train)

disp('Calculate estimated time records');
Ye_train = X_train4' * Bopt;
Ye_test = X_test4' * Bopt;

disp('Calculate training and test error');
% Mean squared error = sum of squared residuals / number of observations
E_train = Y_train - Ye_train;
SSE_train = E_train' * E_train;
MSE_train = SSE_train / N

E_test = Y_test - Ye_test;
SSE_test = E_test' * E_test;
MSE_test = SSE_test / M

disp('Prepare data for surface plot');
% wind speed range: 0-40 km/h, weight range: 0-4 kg
[X1, X2] = meshgrid(0:0.5:40, 0:0.2:4); 
% random temperature (10-30 C) per grid point, as in the generated data sets
T = 10 + 20 * rand(size(X1));

% generate observed (actual) time records
Y = run_time_record(X1, X2, T);

% generate estimated time records; coefficient order follows the rows of
% X_train4: 1, x1, x1^2, x2, x2^2
Ye = Bopt(1) + Bopt(2)*X1 + Bopt(3)*(X1.^2) + Bopt(4)*X2 + Bopt(5)*(X2.^2);

% surface plot of the estimate
surfc(X1, X2, Ye);
axis([0 40 0 4 10 18])
xlabel('wind speed (km/h)');
ylabel('weight (kg)');
zlabel('time record (seconds)');
title('Case 4: Time record estimation');
% new figure window for the observed surface
figure

% surface plot of the observed values on the same grid
surfc(X1, X2, Y);
axis([0 40 0 4 10 18])
xlabel('wind speed (km/h)');
ylabel('weight (kg)');
zlabel('time record (seconds)');
title('Case 4: Observed historical time records');

